Project Set-up

#Load Packages
library(tidyr)
library(leaflet)
library(ggplot2)
library(dplyr)
library(sf)
library(readxl)
library(sp)
library(htmltools)
library(RColorBrewer)
library(tidygeocoder)
library(tidycensus)
library(tigris)
library(units)
library(ggfortify)
library(broom)
library(lubridate)
library(car)

#Set Working Directory
setwd("/Users/sarahhoffman/Library/CloudStorage/OneDrive-UniversityofMaryland/data_sources")

School Location

#Create Data Table of School Locations and limit columns
public.schools <- read.csv("school_locations.csv") %>%
                  select(NAME, LATITUDE, LONGITUDE)
#Create Data Table of Charter Schools
charter.schools <- read.csv("charter_schools.csv") %>%
                  geocode(Address, method = 'osm', lat = latitude, long = longitude) %>% #Convert addresses to coordinates
                  select(charter.latlong, School.Name, latitude, longitude) %>% 
#Limit columns
colnames(charter.schools) <- c("NAME", "LATITUDE", "LONGITUDE") #Change column names

#Manually input NA data
charter.schools[59, 2] <- 38.85632429624879
charter.schools[59, 3] <- -76.98913290464318
charter.schools[69, 2] <- 38.83197915527436
charter.schools[69, 3] <- -77.01854284512433

#Combine public and charter schools
school.locations <- rbind(public.locations, charter.locations)

#Download as csv file for future use
write.csv(school.locations, "/Users/sarahhoffman/Library/CloudStorage/OneDrive-UniversityofMaryland/data_sources/schoolcoordinates.csv", row.names = TRUE)
#Load school locations
school.locations <- read.csv("schoolcoordinates.csv")

#Create school radius
school.radius <- school.locations %>%
                st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
                sf::st_buffer (dist = 402)

head(school.radius)
## Simple feature collection with 6 features and 2 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -77.0452 ymin: 38.83486 xmax: -76.97839 ymax: 38.95189
## Geodetic CRS:  WGS 84
##   X                              NAME                       geometry
## 1 1    Amidon-Bowen Elementary School POLYGON ((-77.01547 38.8824...
## 2 2             Anacostia High School POLYGON ((-76.98737 38.8715...
## 3 3                Ballou High School POLYGON ((-76.99684 38.8394...
## 4 4        Bancroft Elementary School POLYGON ((-77.03796 38.9373...
## 5 5 Bard High School Early College DC POLYGON ((-76.98766 38.8415...
## 6 6         Barnard Elementary School POLYGON ((-77.01614 38.9516...

Crash Data

#Create Data Table of Crashes Involving Pedestrians
crash <- read_xlsx(path = "crashes_19_23.xlsx", sheet = "crashes_19_23_ped") %>%
   #Convert crash coordinates to sf           
  st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) 

#Match crashes to school
crash_filter <- st_filter(crash, school.radius) %>% #Filter to those that fall within radius
                      st_join(left = TRUE, school.radius["NAME"]) 
  
#Transform report date column to as.POSIXct
crash_filter$REPORTDATE <- as.POSIXct(crash_filter$REPORTDATE, format = "%Y/%m/%d %H:%M:%S")
  
#Create columns for year, month, day of week, and hours of day
crash_filter <- crash_filter %>%
                mutate(year = year(crash_filter$REPORTDATE), 
                       month = month(crash_filter$REPORTDATE, label = TRUE), 
                       day = wday(REPORTDATE, label = TRUE), 
                       hour =  hour(REPORTDATE))

head(crash_filter)
## Simple feature collection with 6 features and 69 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -77.02971 ymin: 38.89882 xmax: -76.97417 ymax: 38.92377
## Geodetic CRS:  WGS 84
## # A tibble: 6 × 70
##           X        Y  CRIMEID     CCN REPORTDATE          ROUTEID MEASURE OFFSET
##       <dbl>    <dbl>    <dbl>   <dbl> <dttm>              <chr>     <dbl>  <dbl>
## 1 -8568749. 4707150. 27906900  1.82e7 2019-01-01 01:06:54 120153…    824.   6.46
## 2 -8568749. 4707150. 27906900  1.82e7 2019-01-01 01:06:54 120153…    824.   6.46
## 3 -8568749. 4707150. 27906900  1.82e7 2019-01-01 01:06:54 120153…    824.   6.46
## 4 -8568749. 4707150. 27906900  1.82e7 2019-01-01 01:06:54 120153…    824.   6.46
## 5 -8574908. 4710759. 27906921  1.82e7 2019-01-01 01:53:57 110338…    626.   0.07
## 6 -8574908. 4710759. 27906921  1.82e7 2019-01-01 01:53:57 110338…    626.   0.07
## # ℹ 62 more variables: STREETSEGID <dbl>, ROADWAYSEGID <dbl>, FROMDATE <chr>,
## #   TODATE <lgl>, ADDRESS <chr>, XCOORD <dbl>, YCOORD <dbl>, WARD <chr>,
## #   EVENTID <chr>, MAR_ADDRESS <chr>, MAR_SCORE <dbl>,
## #   MAJORINJURIES_BICYCLIST <dbl>, MINORINJURIES_BICYCLIST <dbl>,
## #   UNKNOWNINJURIES_BICYCLIST <dbl>, FATAL_BICYCLIST <dbl>,
## #   MAJORINJURIES_DRIVER <dbl>, MINORINJURIES_DRIVER <dbl>,
## #   UNKNOWNINJURIES_DRIVER <dbl>, FATAL_DRIVER <dbl>, …
#Count number of crashes, number of pedestrians, and number of major and fatal injuries
crash_school <- crash_filter %>%                      
  group_by(NAME) %>%
  summarise(total_crashes = n(), 
            total_pedestrians = sum(TOTAL_PEDESTRIANS), 
            maj_fat = sum(MAJORINJURIES_PEDESTRIAN, FATAL_PEDESTRIAN))

#Join crash_school data with school location
school.crash <- school.locations %>% 
  full_join(crash_school, by = c("NAME"), suffix=c("",".y")) %>%
  full_join(school.radius, by = c("NAME"), suffix=c("",".y"))

#Change NA to 0
school.crash <- school.crash %>%
  mutate_at(vars(total_crashes, total_pedestrians, maj_fat), ~replace_na(., 0))

head(school.crash)
##   X                              NAME LATITUDE LONGITUDE total_crashes
## 1 1    Amidon-Bowen Elementary School 38.87952 -77.01813            14
## 2 2             Anacostia High School 38.87008 -76.98308            27
## 3 3                Ballou High School 38.83851 -77.00136             6
## 4 4        Bancroft Elementary School 38.93432 -77.04055            18
## 5 5 Bard High School Early College DC 38.84498 -76.98615            21
## 6 6         Barnard Elementary School 38.94823 -77.01778             2
##   total_pedestrians maj_fat                       geometry X.y
## 1                15       1 MULTIPOINT ((-77.02256 38.8...   1
## 2                32       1 MULTIPOINT ((-76.97855 38.8...   2
## 3                 7       2 MULTIPOINT ((-77.00565 38.8...   3
## 4                22       2 MULTIPOINT ((-77.03831 38.9...   4
## 5                24       4 MULTIPOINT ((-76.98557 38.8...   5
## 6                 2       0 MULTIPOINT ((-77.02107 38.9...   6
##                       geometry.y
## 1 POLYGON ((-77.01547 38.8824...
## 2 POLYGON ((-76.98737 38.8715...
## 3 POLYGON ((-76.99684 38.8394...
## 4 POLYGON ((-77.03796 38.9373...
## 5 POLYGON ((-76.98766 38.8415...
## 6 POLYGON ((-77.01614 38.9516...

Crash Graphs

#Look at crashes by year
crashes_year <- crash_filter %>%
  group_by(year) %>%
  summarize(total_crashes = n(), .groups = 'drop') 

crashes_year_label <- format(round(crashes_year$total_crashes, digits = 0),big.mark=",")

ggplot(crashes_year, aes(year, total_crashes)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Number of Pedestrian Involved Crashes by Year", 
       x = "Year", 
       y = "Number of Crashes") +
  geom_text(aes(label=crashes_year_label),  position=position_dodge(width=0.9), vjust=-0.25) 

#Look at top schools
crashes_school <- crash_filter %>%
  group_by(NAME) %>%
  summarize(total_crashes = n(), .groups = 'drop') %>%
  arrange(desc(total_crashes)) %>%
  slice_head(n = 10)

crashes_school <- crashes_school %>%
                  st_drop_geometry(.)

crashes_school_radius <- school.radius %>%
                        right_join(crashes_school, by = "NAME")

crashes_school_label <- format(round(crashes_school$total_crashes, digits = 0),big.mark=",")

ggplot(crashes_school, aes(total_crashes, reorder(NAME, total_crashes))) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Top 10 Schools for \nPedestrian Involved Crashes", 
       x = "Total Crashes", 
       y = "School") +
  scale_x_continuous(limits = c(0,130)) +
  geom_text(aes(label=crashes_school_label),  position=position_dodge(width=0.9), hjust=-0.25, size = 3) 

dc.outline <- counties(state = "DC", year = 2022)
dc.roads <- roads(state = "DC", county = "District of Columbia", year = 2022)
ggplot() +
  geom_sf(data = dc.outline, color = "grey", fill = "white") +
  geom_sf(data = dc.roads, color = "grey") +
  geom_sf(data = crashes_school_radius, size = 5, color = "blue", alpha = 0.5) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(), 
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
  ) +
  labs(title = "Top 10 Schools for Pedestrian Involved Crashes") 

ggplot() +
  geom_sf(data = dc.roads, color = "grey", fill = "white") +
  geom_sf(data = crashes_school_radius, size = 5, color = "blue", alpha = 0.5) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(), 
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
  ) +
  scale_x_continuous(limits = c(-77.05,-77.00)) +
  scale_y_continuous(limits = c(38.87,38.95)) +
  labs(title = "Top 10 Schools for \nPedestrian Involved Crashes") 

#Look at top schools for crashes by year
crashes_year_school <- crash_filter %>%
  group_by(year, NAME) %>%
  summarize(total_crashes = n(), .groups = 'drop') %>%
  arrange(year, desc(total_crashes)) %>%
  group_by(year) %>%
  slice_head(n = 5)  %>% 
  mutate (ToHighlight = ifelse(total_crashes == max(total_crashes), "yes", "no"))

school_data <- format(round(crashes_year_school$total_crashes, digits = 0),big.mark=",")

ggplot(crashes_year_school, aes(y = NAME, x = total_crashes, fill = ToHighlight)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ year, scales = "fixed", nrow = 1) +  # Create a separate plot for each year
  geom_text(aes(label=school_data),  position=position_dodge(width=1), vjust=-1, hjust=-0.25, size = 3) + #Add data labels
  labs(title = "Top 5 Schools for \nNumber of Pedestrian-Involved \nCrashes by Year", 
       x = "Total Crashes", 
       y = "School") +
  scale_x_continuous(limits = c(0,50), breaks = seq(0, 50, by = 5)) +
  theme(axis.text.y = element_text( hjust = 1), 
        axis.line.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks = element_blank(), 
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())  

#Look at crashes by month
crashes_month <- crash_filter %>%
  group_by(month) %>%
  summarize(total_crashes = n(), .groups = 'drop') 

crashes_month_label <- format(round(crashes_month$total_crashes, digits = 0),big.mark=",")

ggplot(crashes_month, aes(month, total_crashes)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Number of Pedestrian Involved Crashes by Month", 
       x = "Month", 
       y = "Number of Crashes") +
  geom_text(aes(label=crashes_month_label),  position=position_dodge(width=0.9), vjust=-0.25) 

#Look at crashes by day of week
crashes_day <- crash_filter %>%
  group_by(day) %>%
  summarize(total_crashes = n(), .groups = 'drop') 

crashes_day_label <- format(round(crashes_day$total_crashes, digits = 0),big.mark=",")

ggplot(crashes_day, aes(day, total_crashes)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Number of Pedestrian Involved Crashes by Day of Week", 
       x = "Day of Week", 
       y = "Number of Crashes") +
  geom_text(aes(label=crashes_day_label),  position=position_dodge(width=0.9), vjust=-0.25) 

#Look at crashes by hour of day
crashes_hour <- crash_filter %>%
  group_by(hour) %>%
  summarise(total_crashes = n()) %>%
  arrange(hour)

hour_labels <- c("12", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11")
crashes_hour_label <- format(round(crashes_hour$total_crashes, digits = 0),big.mark=",")

ggplot(crashes_hour, aes(x = factor(hour), y = total_crashes)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Number of Pedestrian Involved Crashes by Hour of the Day",
       x = "Hour of Day",
       y = "Number of Crashes") +
  geom_text(aes(label=crashes_hour_label),  position=position_dodge(width=0.9), vjust=-0.25, size = 3) +
  theme_minimal() +
  scale_x_continuous(breaks = 0:23) + # Set x-axis to show all hours
  scale_x_discrete(labels = hour_labels) +
  theme(
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Tree Location Data

#Load tree location data
tree <- read.csv("number_trees.csv") %>%
        filter(!is.na(X)) %>%
        filter(!is.na(Y)) %>%
        sf::st_as_sf(coords = c("X", "Y"), crs = 26985) %>% #Convert x and y coordinates to longitude and latitude
        st_transform(crs = 4326) %>%
        mutate(lat= st_coordinates(.)[,1],
        lon = st_coordinates(.)[,2]) %>%
        st_as_sf(coords = c("longitude", "latitude"), crs = 4326) #Convert back to sf

#Switch longitude and latitude columns
colnames(tree)[54] <- paste("longitude")
colnames(tree)[55] <- paste("latitue")

#Filter those trees that fall within schools 
tree.school <- tree %>%
              st_filter(school.radius) %>%
              st_join(left = TRUE, school.radius["NAME"]) %>%
              group_by(NAME) %>%
              summarise(total_trees = n(), avg_crown = mean(CROWN_AREA, na.rm = TRUE)) #Count number of trees and average crown area

#Combine school location data with tree data
school.crash.tree <- school.crash %>% 
  full_join(tree.school, by = "NAME", suffix=c("",".y")) %>%
  select(-ends_with(".y")) %>%
  mutate(avg_crown = round(avg_crown, 0))

#Look at top ten schools for tree density
treed_school <- school.crash.tree %>%
  arrange(desc(total_trees)) %>%
  slice_head(n = 10)

treed_school_label <- format(round(treed_school$total_trees, digits = 0),big.mark=",")

ggplot(treed_school, aes(total_trees, reorder(NAME, total_trees))) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Top 10 Schools for \nTree Density", 
       x = "Total Trees", 
       y = "School") +
  scale_x_continuous(limits = c(0,2000)) +
  geom_text(aes(label=treed_school_label),  position=position_dodge(width=0.9), hjust=-0.25, size = 3)

treed_school <- treed_school %>%
                  st_drop_geometry(.)

treed_school_radius <- school.radius %>%
                        right_join(treed_school, by = "NAME")
ggplot() +
  geom_sf(data = dc.outline, color = "grey", fill = "white") +
  geom_sf(data = dc.roads, color = "grey") +
  geom_sf(data = treed_school_radius, size = 5, color = "blue", alpha = 0.5) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(), 
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
  ) +
  labs(title = "Top 10 Schools for Tree Density")

Tree Coverage Data

#Create data table of tree coverage data
tree.coverage <- read.csv("treecanopy_20_all.csv") %>%
  select(GEOID, UTC_PCT) %>%
  transform(GEOID = as.character(GEOID))

#Update block groups to match 2022
crosswalk <- read.csv("crosswalk.csv") %>%
              select(bg2010ge, bg2020ge) %>%
              transform(bg2010ge = as.character(bg2010ge), 
                        bg2020ge = as.character(bg2020ge))
#Get block group geometries
dc.blocks <- block_groups(state = "DC", year = 2010)
#Join block group geometries with tree coverage data
tree.coverage.geometry <- tree.coverage %>%
  full_join(dc.blocks, by = c("GEOID" = "GEOID10")) %>%
  left_join(crosswalk, by = c("GEOID" = "bg2010ge")) %>%
  st_as_sf(., crs = 4326)

#Match tree coverage tracts to schools
tree.coverage.match <- st_join(school.radius, tree.coverage.geometry, largest = TRUE)

#Combine tree coverage with crash and tree count data
school.cr.tr.co <- school.crash.tree %>%
  full_join(tree.coverage.match, by = "NAME", suffix=c("",".y")) %>%
  select(-ends_with(".y")) 
school.cr.tr.co$bg2020ge <- as.character(school.cr.tr.co$bg2020ge)

#Look at top 10 schools for tree coverage
treec_school <- school.cr.tr.co %>%
  arrange(desc(UTC_PCT)) %>%
  slice_head(n = 10)

ggplot(treec_school, aes(UTC_PCT, reorder(NAME, UTC_PCT))) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Top 10 Schools for \nTree Coverage", 
       x = "Percent Urban Tree Coverage", 
       y = "School") +
  scale_x_continuous(limits = c(0,100)) +
  geom_text(aes(label=UTC_PCT),  position=position_dodge(width=0.9), hjust=-0.25, size = 3)

treec_school <- treec_school %>%
                  st_drop_geometry(.)

treec_school_radius <- school.radius %>%
                        right_join(treec_school, by = "NAME")
ggplot() +
  geom_sf(data = dc.outline, color = "grey", fill = "white") +
  geom_sf(data = dc.roads, color = "grey") +
  geom_sf(data = treec_school_radius, size = 5, color = "blue", alpha = 0.5) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(), 
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
  ) +
  labs(title = "Top 10 Schools for Tree Coverage")

Median Income and Population Data

#Set Census Api Key
census_api_key("9ac1cc1ce7936fa370a67b3f87d37169566116f6")
#Get population and median income data for block groups
block.data <- get_acs(geography = "block group",
                     state = "DC",
                     year = 2022,
                     survey = "acs5", 
                     output = "wide",
                     variables = c(medincome = "B19013_001", pop = "B01003_001"),
                     geometry = TRUE) %>%
                    mutate(area = st_area(.))
#Check missing values
colSums(is.na(block.data))
##      GEOID       NAME medincomeE medincomeM       popE       popM   geometry 
##          0          0         83        115          0          0          0 
##       area 
##          0
#Isolate rows with missing med income
block.data.inc.na <- block.data[is.na(block.data$medincomeE),]

#Load variables on household count by income category
inc.variables <- c("10" = "B19001_002", "10.15" = "B19001_003", 
                   "15.20" = "B19001_004", "20.25" = "B19001_005", "25.30" = "B19001_006", 
                   "30.35" = "B19001_007", "35.40" = "B19001_008", "40.45" = "B19001_009", 
                   "45.50" = "B19001_010", "50.60" = "B19001_011", "60.75" = "B19001_012", 
                   "75.100" = "B19001_013", "100.125" = "B19001_014", "125.150" = "B19001_015", 
                   "150.200" = "B19001_016", "200" = "B19001_017")

block.inc <- get_acs(geography = "block group",
                      state = "DC",
                      year = 2022,
                      survey = "acs5", 
                      output = "tidy",
                      variables = inc.variables) 

block.inc$GEOID <- as.numeric(block.inc$GEOID)
block.inc$variable <- as.numeric(block.inc$variable)

#Find 50th percentile income category
inc.imp <- block.inc %>%
          group_by(GEOID) %>%
          summarise(med = median(rep(variable,estimate), na.rm = TRUE))

inc.imp$GEOID <- as.character(inc.imp$GEOID)

#Impute estimate of median income based on range
inc.imp <- inc.imp %>%
          mutate(medincomeE = ifelse(med == "10", 10000, ifelse(med == "10.15", 12500, ifelse(med == "15.20", 17500, ifelse(med == "20.25", 22500, ifelse(med == "25.30", 27500, ifelse(med == "30.35", 32500, ifelse(med == "35.40", 37500, ifelse(med == "40.45", 42500, ifelse(med == "45.50", 47500, ifelse(med == "50.60", 55000, ifelse(med == "60.75", 67500, ifelse(med == "75.100", 87500, ifelse(med == "100.125", 125000, ifelse(med == "125.150", 137500, ifelse(med == "150.200", 175000, 200000))))))))))))))))

#Apply imputed values to original dataset
block.data.inc.imp <- block.data.inc.na %>%
                      left_join(inc.imp, by = "GEOID", suffix=c(".x","")) %>%
                      select(GEOID, medincomeE) %>%
                      st_drop_geometry(.)

block.data <- block.data %>%
              left_join(block.data.inc.imp, by = "GEOID") 

block.data$medincomeE.y <- ifelse(is.na(block.data$medincomeE.y), block.data$medincomeE.x, block.data$medincomeE.y)              

block.data <- block.data %>%
              select(-medincomeE.x)

#Check missing values again
colSums(is.na(block.data))
##        GEOID         NAME   medincomeM         popE         popM         area 
##            0            0          115            0            0            0 
## medincomeE.y     geometry 
##           21            0
#Calculate population density and proportion of population

block.data$area <- set_units(block.data$area, km^2)
block.data <- block.data %>%
              mutate(density = (popE/area)/1000) %>%
              mutate(pop_prop = popE/sum(popE))

#Join population and median income with combined data 
school.cr.tr.co.ce <- school.cr.tr.co %>%
  left_join(block.data, by = c("bg2020ge" = "GEOID"))

Road Classification, Speed limit, and Traffic Volume Data

#Load road classification data
road <- read.csv("roadway_classification.csv") %>%
              select(SPEEDLIMITS_OB, DCFUNCTIONALCLASS, ADDRESS)
#Geocode road coordinates
road <- road %>%
        geocode(ADDRESS, method = "osm", lat = latitude, lon = longitude)

#Save csv file for later use
st_write(road, "/Users/sarahhoffman/Library/CloudStorage/OneDrive-UniversityofMaryland/data_sources/roadcoordinates.csv")
#Load in csv file
road <- read.csv("roadcoordinates.csv")

#Omit NA values and convert to sf
road.clean <- na.omit(road) %>%
            st_as_sf(coords = c("longitude", "latitude"), crs = 4326)

#Change 0 values for AADT to NA
road.clean$AADT[road.clean$AADT == 0] <- NA

#Create column to count major arterial and minor arterial roads
road.clean <- road.clean %>%
              mutate(arterial = ifelse(DCFUNCTIONALCLASS == 14, TRUE, ifelse(DCFUNCTIONALCLASS ==16, TRUE, FALSE)))

#Filter and join roads to school radius
road.school <- st_filter(road.clean, school.radius) %>% #Filter to those that fall within radius
  st_join(left = TRUE, school.radius["NAME"]) %>% #Match crashes to a school
  group_by(NAME) %>%
  summarise(avg_speed = mean(SPEEDLIMITS_OB), n_roads = n(), n_arterial = sum(arterial), prop_arterial = n_arterial/n_roads, total_vol = sum(AADT, na.rm = TRUE)) #Calculate average speed limit, number and proportion of arterial roads, and sum of AADT
road.school$total_vol[road.school$total_vol == 0] <- NA

#Join with school data set
school.cr.tr.co.ce.rd <- school.cr.tr.co.ce %>% 
  full_join(road.school, by = c("NAME.x" = "NAME"), suffix=c("",".y")) 

#Select columns
school.data <- school.cr.tr.co.ce.rd %>%
              select(c(NAME.x, total_crashes, total_pedestrians, maj_fat, total_trees, avg_crown, UTC_PCT, GEOID, bg2020ge, medincomeE.y, medincomeM, popE, popM, geometry.y, density, pop_prop, avg_speed, n_roads, n_arterial, prop_arterial, total_vol))

#Create column that calculates proportion of pedestrians involved that had major or fatal injuries
school.data <- school.data %>%
              mutate(prop_majfat = maj_fat/total_pedestrians) 

#Change NA Values to O
school.data$prop_majfat[is.na(school.data$prop_majfat)] <- 0

Safe Routes to School Data

#Load in SRTS data 
school.srts <- read.csv("SRTS.csv")

#Join SRTS data with school.data
school.data <- school.data %>%
  left_join(school.srts, by = "NAME.x")

Log transformation

#Add log transformations for total crashes, median income, and proportion of major/fatal injuries
school.data <- school.data %>%
              mutate(log_crash = log(total_crashes),
                     log.income = log(medincomeE.y),
                     log_majfat = log(prop_majfat))

#Change -Inf values to 0
school.data$log_crash[school.data$log_crash == -Inf] <- 0

#Change -Inf values to -4
school.data$log_majfat[!is.finite(school.data$log_majfat)] <- -4

Summary Statistics

#Total Crashes
summary(school.data$total_crashes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.00   15.00   20.99   31.00  115.00
boxplot(school.data$total_crashes)

#Total Pedestrians Involved
summary(school.data$total_pedestrians)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.00   16.00   22.45   32.00  128.00
boxplot(school.data$total_pedestrians)

#Number of major or fatal injuries
summary(school.data$maj_fat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   2.745   4.000  15.000
boxplot(school.data$maj_fat)

#Proportion of major or fatal injuries
summary(school.data$prop_majfat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04734 0.11111 0.12807 0.17167 0.75000
boxplot(school.data$prop_majfat)

#Total Trees
summary(school.data$total_trees)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    16.0   710.2   989.0   944.7  1164.2  1590.0
boxplot(school.data$total_trees)

#Average crown area
summary(school.data$avg_crown)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   209.0   638.8   758.0   762.8   875.0  1508.0
boxplot(school.data$avg_crown)

#Percent Urban Tree Coverage
summary(school.data$UTC_PCT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.70   19.46   25.61   28.12   32.93   72.95
boxplot(school.data$UTC_PCT)

#Median Income
summary(school.data$medincomeE.y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12500   59114  102896  113899  153897  250001      10
boxplot(school.data$medincomeE.y)

#Population
summary(school.data$popE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     926    1409    1344    1762    3385
boxplot(school.data$popE)

#Average Speed Limit
summary(school.data$avg_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   20.81   21.46   21.62   22.21   25.95
boxplot(school.data$avg_speed)

#Number of Roads
summary(school.data$n_roads)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   38.00   49.00   48.41   59.00   92.00
boxplot(school.data$n_roads)

#Proportion Arterial
summary(school.data$prop_arterial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1386  0.2375  0.2528  0.3487  0.9348
boxplot(school.data$prop_arterial)

#Traffic Volume
summary(school.data$total_vol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    6136   69825  136250  162229  226024  611373       8
boxplot(school.data$total_vol)

#Density
summary(school.data$density)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.364   4.778   6.170   7.119  41.350
boxplot(school.data$density)

#Standard Deviation
school.data.sd <- school.data %>%
  select(total_crashes, total_pedestrians, maj_fat, prop_majfat, total_trees,
         avg_crown, UTC_PCT, medincomeE.y, popE,
         avg_speed, n_roads, prop_arterial, total_vol) %>%
  apply(2, sd, na.rm = TRUE)

school.data.sd
##     total_crashes total_pedestrians           maj_fat       prop_majfat 
##      1.863234e+01      1.982786e+01      2.852337e+00      1.214837e-01 
##       total_trees         avg_crown           UTC_PCT      medincomeE.y 
##      3.005419e+02      1.812980e+02      1.309376e+01      6.385593e+04 
##              popE         avg_speed           n_roads     prop_arterial 
##      6.001058e+02      1.095409e+00      1.628168e+01      1.696150e-01 
##         total_vol 
##      1.150898e+05

Histogram charts

#Histogram of number of pedestrian involved crashes
ggplot(school.data, aes(x = total_crashes)) +
  geom_histogram(binwidth = 5, color = "darkgrey", fill = "grey") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  scale_x_continuous(breaks = seq(0,120,10)) +
  labs(title = "Distribution of Pedestrian \nInvolved Crashes in School Radius", 
       x = "Total Number of Pedestrian Involved Crashes", 
       y = "Count") +
  geom_vline(aes(xintercept = mean(total_crashes)), color = "blue", alpha = 0.7) +
  geom_vline(aes(xintercept = median(total_crashes)), color = "darkorange", alpha = 0.7) +
  geom_text(aes(x=mean(total_crashes), label="\nMean", y=40), colour="blue", angle=90) +
  geom_text(aes(x=median(total_crashes), label="Median\n", y=40), colour="darkorange", angle=90)

#Histogram of log of pedestrian involved crashes
ggplot(school.data, aes(x = log_crash)) +
  geom_histogram(binwidth = 0.5, color = "darkgrey", fill = "grey") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Distribution of Pedestrian \nInvolved Crashes in School Radius (log)", 
       x = "Total Number of Pedestrian Involved Crashes (log)", 
       y = "Count") +
  scale_x_continuous(breaks = seq(0,5,0.5)) +
  geom_vline(aes(xintercept = mean(log_crash)), color = "blue", alpha = 0.7) +
  geom_vline(aes(xintercept = median(log_crash)), color = "darkorange", alpha = 0.7) +
  geom_text(aes(x=mean(log_crash), label="Mean\n", y=10), color="blue", angle=90) +
  geom_text(aes(x=median(log_crash), label="\nMedian", y=10), color="darkorange", angle=90)

#Histogram of proportion of major or fatal injuries
ggplot(school.data, aes(x = prop_majfat)) +
  geom_histogram(binwidth = 0.1, color = "darkgrey", fill = "grey") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Distribution of the Proportion \nof Major or Fatal Injuries", 
       x = "Proportion of Major or Fatal Injuries", 
       y = "Count") +
  scale_x_continuous(breaks = seq(0,1,0.1)) +
  geom_vline(aes(xintercept = mean(prop_majfat)), color = "blue", alpha = 0.7) +
  geom_vline(aes(xintercept = median(prop_majfat)), color = "darkorange", alpha = 0.7) +
  geom_text(aes(x=mean(prop_majfat), label="\nMean", y=50), color="blue", angle=90) +
  geom_text(aes(x=median(prop_majfat), label="Median\n", y=50), color="darkorange", angle=90)

#Histogram of proportion of major or fatal injuries (log)
ggplot(school.data, aes(x = log_majfat)) +
  geom_histogram(binwidth = 0.5, color = "darkgrey", fill = "grey") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Distribution of the Proportion \nof Major or Fatal Injuries (log)", 
       x = "Proportion of Major or Fatal Injuries (log)", 
       y = "Count") +
  scale_x_continuous(breaks = seq(-4,0,0.5)) +
  geom_vline(aes(xintercept = mean(log_majfat)), color = "blue", alpha = 0.7) +
  geom_vline(aes(xintercept = median(log_majfat)), color = "darkorange", alpha = 0.7) +
  geom_text(aes(x=mean(log_majfat), label="Mean\n", y=50), color="blue", angle=90) +
  geom_text(aes(x=median(log_majfat), label="\nMedian", y=50), color="darkorange", angle=90)

#Histogram of tree density
ggplot(school.data, aes(x = total_trees)) +
  geom_histogram(binwidth = 50, color = "darkgrey", fill = "grey") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  scale_x_continuous(breaks = seq(0,2000,250)) +
  labs(title = "Distribution of Tree Density", 
       x = "Number of Trees", 
       y = "Count") +
  geom_vline(aes(xintercept = mean(total_trees)), color = "blue", alpha = 0.7) +
  geom_vline(aes(xintercept = median(total_trees)), color = "darkorange", alpha = 0.7) +
  geom_text(aes(x=mean(total_trees), label="Mean\n", y=5), color="blue", angle=90) +
  geom_text(aes(x=median(total_trees), label="\nMedian", y=5), color="darkorange", angle=90)

#Histogram of tree coverage
ggplot(school.data, aes(x = UTC_PCT)) +
  geom_histogram(binwidth = 5, color = "darkgrey", fill = "grey") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  scale_x_continuous(breaks = seq(0,80,10)) +
  scale_y_continuous(breaks = seq(0,60,10)) +
  labs(title = "Distribution of Percent Urban Tree Coverage", 
       x = "Percent Urban Tree Coverage", 
       y = "Count") +
  geom_vline(aes(xintercept = mean(UTC_PCT)), color = "blue", alpha = 0.7) +
  geom_vline(aes(xintercept = median(UTC_PCT)), color = "darkorange", alpha = 0.7) +
  geom_text(aes(x=mean(UTC_PCT), label="\nMean", y=10), color="blue", angle=90) +
  geom_text(aes(x=median(UTC_PCT), label="Median\n", y=10), color="darkorange", angle=90)

Linear Regression Models

#Check correlation between independent variables
correlation_matrix <- cor(school.data[, c("UTC_PCT", "total_trees", "medincomeE.y",
                                          "density", "avg_speed",
                                          "prop_arterial", "total_vol",
                                          "SRTS")], use="complete.obs")
correlation_matrix
##                   UTC_PCT total_trees medincomeE.y     density   avg_speed
## UTC_PCT        1.00000000 -0.23691423  -0.02347938 -0.38277089 -0.12954815
## total_trees   -0.23691423  1.00000000   0.37943532  0.24199697 -0.01226230
## medincomeE.y  -0.02347938  0.37943532   1.00000000  0.03464347  0.18965796
## density       -0.38277089  0.24199697   0.03464347  1.00000000  0.22591737
## avg_speed     -0.12954815 -0.01226230   0.18965796  0.22591737  1.00000000
## prop_arterial -0.29783091  0.08077900   0.26803253  0.35580099  0.53343231
## total_vol     -0.37634369  0.28823088   0.27587765  0.50274562  0.46490971
## SRTS          -0.02779615  0.01188117   0.04989265 -0.04852689 -0.03779806
##               prop_arterial   total_vol        SRTS
## UTC_PCT          -0.2978309 -0.37634369 -0.02779615
## total_trees       0.0807790  0.28823088  0.01188117
## medincomeE.y      0.2680325  0.27587765  0.04989265
## density           0.3558010  0.50274562 -0.04852689
## avg_speed         0.5334323  0.46490971 -0.03779806
## prop_arterial     1.0000000  0.71230493  0.10164940
## total_vol         0.7123049  1.00000000  0.09242527
## SRTS              0.1016494  0.09242527  1.00000000
#Run linear regression models

#Full model, non log-transformed crashes
full.mdl <- lm(total_crashes~UTC_PCT+total_trees+log.income+density+avg_speed+prop_arterial+total_vol+SRTS, data = school.data)

#Summary and coefficients
summary(full.mdl)
## 
## Call:
## lm(formula = total_crashes ~ UTC_PCT + total_trees + log.income + 
##     density + avg_speed + prop_arterial + total_vol + SRTS, data = school.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.035  -8.027  -1.913   5.143  60.661 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.905e+01  2.883e+01   0.661 0.509523    
## UTC_PCT       -2.922e-01  8.353e-02  -3.498 0.000582 ***
## total_trees   -2.672e-03  3.967e-03  -0.673 0.501487    
## log.income    -2.500e+00  1.674e+00  -1.493 0.137007    
## density        4.584e-01  2.106e-01   2.176 0.030748 *  
## avg_speed      1.179e+00  1.127e+00   1.046 0.296867    
## prop_arterial  4.476e+00  9.695e+00   0.462 0.644841    
## total_vol      7.457e-05  1.463e-05   5.097 8.26e-07 ***
## SRTS           1.435e-01  2.624e+00   0.055 0.956452    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.68 on 191 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.4578, Adjusted R-squared:  0.4351 
## F-statistic: 20.16 on 8 and 191 DF,  p-value: < 2.2e-16
coefficients(full.mdl)
##   (Intercept)       UTC_PCT   total_trees    log.income       density 
##  1.905445e+01 -2.922229e-01 -2.671699e-03 -2.500252e+00  4.584156e-01 
##     avg_speed prop_arterial     total_vol          SRTS 
##  1.179380e+00  4.475845e+00  7.457405e-05  1.434928e-01
full.mdl.residuals <- residuals(full.mdl) %>%
                      as.data.frame(.)

#Autoplot
full.mdl.plot <- autoplot(full.mdl, which = 1:3, nrow = 3, ncol = 1)

full.mdl.plot

#Distribution of residuals
ggplot(full.mdl.residuals, aes(.)) +
  theme_minimal()+
  labs(title = "Distribution of Residuals") +
  geom_histogram()

#Full model, log-transformed crashes
full.log.mdl <- lm(log_crash~UTC_PCT+total_trees+log.income+density+avg_speed+prop_arterial+total_vol+SRTS, data = school.data)

#Summary and coefficients
summary(full.log.mdl)
## 
## Call:
## lm(formula = log_crash ~ UTC_PCT + total_trees + log.income + 
##     density + avg_speed + prop_arterial + total_vol + SRTS, data = school.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.16551 -0.44020  0.05294  0.51447  1.62740 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.908e+00  1.545e+00   2.530 0.012203 *  
## UTC_PCT       -2.289e-02  4.475e-03  -5.116 7.56e-07 ***
## total_trees   -1.544e-04  2.125e-04  -0.727 0.468397    
## log.income    -2.687e-01  8.969e-02  -2.996 0.003103 ** 
## density        2.232e-02  1.128e-02   1.978 0.049357 *  
## avg_speed      9.140e-02  6.040e-02   1.513 0.131861    
## prop_arterial  1.663e-01  5.193e-01   0.320 0.749147    
## total_vol      3.023e-06  7.837e-07   3.858 0.000156 ***
## SRTS          -1.059e-01  1.406e-01  -0.753 0.452155    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7328 on 191 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.4327, Adjusted R-squared:  0.4089 
## F-statistic: 18.21 on 8 and 191 DF,  p-value: < 2.2e-16
coefficients(full.log.mdl)
##   (Intercept)       UTC_PCT   total_trees    log.income       density 
##  3.908286e+00 -2.289305e-02 -1.544026e-04 -2.686638e-01  2.231789e-02 
##     avg_speed prop_arterial     total_vol          SRTS 
##  9.139601e-02  1.663020e-01  3.023418e-06 -1.059061e-01
exp(coefficients(full.log.mdl))
##   (Intercept)       UTC_PCT   total_trees    log.income       density 
##    49.8135089     0.9773670     0.9998456     0.7644002     1.0225688 
##     avg_speed prop_arterial     total_vol          SRTS 
##     1.0957028     1.1809297     1.0000030     0.8995091
#Autoplot
full.log.mdl.plot <- autoplot(full.log.mdl, which = 1:3, nrow = 3, ncol = 1)

full.log.mdl.plot

#Distribution of residuals
full.log.mdl.residuals <- residuals(full.log.mdl) %>%
  as.data.frame(.)

ggplot(full.log.mdl.residuals, aes(.)) +
  theme_minimal()+
  labs(title = "Distribution of Residuals (log model)") +
  geom_histogram()

#Check VIF
full.log.mdl.vif <- vif(full.log.mdl)

full.log.mdl.vif
##       UTC_PCT   total_trees    log.income       density     avg_speed 
##      1.268390      1.367205      1.344878      1.503438      1.478916 
## prop_arterial     total_vol          SRTS 
##      2.417257      2.709501      1.038390
#Full model, log-transformed crashes, major or fatal injuries 
full.inj.mdl <- lm(prop_majfat~UTC_PCT+total_trees+log.income+density+avg_speed+prop_arterial+total_vol+SRTS, data = school.data)

#Summary and coefficients
summary(full.inj.mdl)
## 
## Call:
## lm(formula = prop_majfat ~ UTC_PCT + total_trees + log.income + 
##     density + avg_speed + prop_arterial + total_vol + SRTS, data = school.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22414 -0.09138 -0.01255  0.05309  0.52768 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    4.272e-01  2.498e-01   1.710  0.08886 . 
## UTC_PCT        1.709e-03  7.236e-04   2.362  0.01919 * 
## total_trees    2.783e-05  3.437e-05   0.810  0.41901   
## log.income    -4.534e-02  1.450e-02  -3.126  0.00205 **
## density       -2.546e-04  1.825e-03  -0.140  0.88915   
## avg_speed      6.934e-03  9.767e-03   0.710  0.47862   
## prop_arterial  3.342e-02  8.398e-02   0.398  0.69113   
## total_vol     -6.410e-08  1.267e-07  -0.506  0.61358   
## SRTS           1.291e-03  2.273e-02   0.057  0.95478   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1185 on 191 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.09386,    Adjusted R-squared:  0.05591 
## F-statistic: 2.473 on 8 and 191 DF,  p-value: 0.01425
coefficients(full.inj.mdl)
##   (Intercept)       UTC_PCT   total_trees    log.income       density 
##  4.271520e-01  1.708993e-03  2.783290e-05 -4.534424e-02 -2.546412e-04 
##     avg_speed prop_arterial     total_vol          SRTS 
##  6.933613e-03  3.341776e-02 -6.410189e-08  1.290871e-03
#Autoplot
full.mdl.inj.plot <- autoplot(full.inj.mdl, which = 1:3, nrow = 3, ncol = 1)

full.mdl.inj.plot

#Distribution of residuals
full.inj.mdl.residuals <- residuals(full.inj.mdl) %>%
  as.data.frame(.)

ggplot(full.inj.mdl.residuals, aes(.)) +
  theme_minimal()+
  labs(title = "Distribution of Residuals") +
  geom_histogram()

#Check VIF
full.inj.mdl.vif <- vif(full.inj.mdl)

full.inj.mdl.vif
##       UTC_PCT   total_trees    log.income       density     avg_speed 
##      1.268390      1.367205      1.344878      1.503438      1.478916 
## prop_arterial     total_vol          SRTS 
##      2.417257      2.709501      1.038390
#Full model, log-transformed crashes, interaction between tree coverage and arterial roads
full.log.mdl.int <- lm(log_crash~UTC_PCT*prop_arterial+total_trees+log.income+density+avg_speed+total_vol+SRTS, data = school.data)

#Summary and coefficients
summary(full.log.mdl.int)
## 
## Call:
## lm(formula = log_crash ~ UTC_PCT * prop_arterial + total_trees + 
##     log.income + density + avg_speed + total_vol + SRTS, data = school.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.16135 -0.45506  0.06426  0.49811  1.59420 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.653e+00  1.638e+00   2.230 0.026923 *  
## UTC_PCT               -1.954e-02  8.365e-03  -2.335 0.020561 *  
## prop_arterial          4.869e-01  8.521e-01   0.571 0.568368    
## total_trees           -1.503e-04  2.131e-04  -0.705 0.481492    
## log.income            -2.644e-01  9.033e-02  -2.927 0.003845 ** 
## density                2.129e-02  1.151e-02   1.849 0.066028 .  
## avg_speed              9.696e-02  6.164e-02   1.573 0.117382    
## total_vol              3.033e-06  7.855e-07   3.861 0.000155 ***
## SRTS                  -1.045e-01  1.409e-01  -0.742 0.459274    
## UTC_PCT:prop_arterial -1.377e-02  2.897e-02  -0.475 0.635193    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7343 on 190 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.4334, Adjusted R-squared:  0.4065 
## F-statistic: 16.15 on 9 and 190 DF,  p-value: < 2.2e-16
coefficients(full.log.mdl.int)
##           (Intercept)               UTC_PCT         prop_arterial 
##          3.653121e+00         -1.953709e-02          4.869188e-01 
##           total_trees            log.income               density 
##         -1.503141e-04         -2.643540e-01          2.128525e-02 
##             avg_speed             total_vol                  SRTS 
##          9.696373e-02          3.032734e-06         -1.044805e-01 
## UTC_PCT:prop_arterial 
##         -1.376659e-02
exp(coefficients(full.log.mdl.int))
##           (Intercept)               UTC_PCT         prop_arterial 
##            38.5949361             0.9806525             1.6272945 
##           total_trees            log.income               density 
##             0.9998497             0.7677017             1.0215134 
##             avg_speed             total_vol                  SRTS 
##             1.1018204             1.0000030             0.9007923 
## UTC_PCT:prop_arterial 
##             0.9863277
#Autoplot
full.log.mdl.int.plot <- autoplot(full.log.mdl.int, which = 1:3, nrow = 3, ncol = 1)

full.log.mdl.int.plot

#Distribution of residuals
full.log.mdl.int.residuals <- residuals(full.log.mdl.int) %>%
  as.data.frame(.)

ggplot(full.log.mdl.int.residuals, aes(.)) +
  theme_minimal()+
  labs(title = "Distribution of Residuals (interaction)") +
  geom_histogram()

Scatter plot

ggplot(school.data, aes(x=UTC_PCT, y = log_crash)) +
  geom_point(fill = "darkgrey", size = 3, alpha = 0.5) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Percent Urban Tree Coverage and \nPedestrain Involved Crashes in School Radius (log)", 
       x = "Percent Urban Tree Coverage", 
       y = "Total Number of Crashes (log)") +
  scale_x_continuous(breaks = seq(0,80,10)) +
  scale_y_continuous(breaks = seq(0,5,0.5)) +
  geom_smooth(method = "lm", se = FALSE, alpha = 0.7)

ggplot(school.data, aes(x=UTC_PCT, y = prop_majfat)) +
  geom_point(fill = "darkgrey", size = 3, alpha = 0.5) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(linetype = "solid")
  ) +
  labs(title = "Percent Urban Tree Coverage and \nProportion of Major or Fatal Injuries",
       x = "Percent Urban Tree Coverage", 
       y = "Proportion of Major or Fatal Pedestrian Injuries") +
  scale_x_continuous(breaks = seq(0,80,10)) +
  scale_y_continuous(breaks = seq(0,1,0.1)) +
  geom_smooth(method = "lm", se = FALSE, alpha = 0.7)

Leaflet Map

#Create labels for schools and crashes
school.labels <- paste("<b>", school.data$NAME.x, "</b>", "<br/>", 
                       "Tree Coverage Percent:",school.data$UTC_PCT,"%", "<br/>",
                       "Total Crashes: ",school.data$total_crashes, "<br/>") %>%
  lapply(htmltools::HTML)
 
#Color coding crashes
crash.bins <- c(0,1,2,3,4,5)
crash.color <- colorBin("Purples", bins = crash.bins)

#Color coding coverage
coverage.bins <- c(0,15,30,45,75)
coverage.color <- colorBin("Greens", bins = coverage.bins)

#Color coding median income
income.bins <- c(0,20000,40000,60000,80000, 100000, 150000, 200000, 251000)
income.color <- colorBin("Blues", bins = income.bins)

#Color coding traffic volume
traffic.bins <- c(0,50000,100000,150000,200000,250000,300000,350000, 400000, 650000)
traffic.color <- colorBin("Reds", bins = traffic.bins)

#Color coding proportion arterial
art.bins <- c(0, 0.2, 0.4, 0.6, 0.7, 0.8, 1)
art.color <- colorBin("Oranges", bins = art.bins)

#Create map with school location, radius, and crashes
school.map <- leaflet() %>%
  addProviderTiles(provider = "CartoDB.Positron") %>%
  setView( lng = -77.0369
           , lat = 38.9072
           , zoom = 11) %>%
  addMapPane("points", zIndex = 410) %>%
  #Crashes
  addPolygons(data = school.radius, label = school.labels, stroke = TRUE, fillColor = ~crash.color(school.data$log_crash), color = "#413452", fillOpacity = 0.75,
              highlight = highlightOptions(weight = 5, color = "white",bringToFront = TRUE), group = "Crashes") %>%
  addLegend(pal = crash.color, values = school.data$log_crash, title = "Total Number of Crashes (log)", position = "bottomright", group = "Crashes") %>%
  #Coverage
  addPolygons(data = school.radius, label = school.labels, stroke = TRUE, fillColor = ~coverage.color(school.data$UTC_PCT), color = "darkgreen", fillOpacity = 0.75,
              highlight = highlightOptions(weight = 5, color = "white",bringToFront = TRUE), group = "Tree Coverage") %>%
  addLegend(pal = coverage.color, values = school.data$UTC_PCT, title = "Percent Urban Tree Coverage", position = "bottomright", group = "Tree Coverage") %>%
  #Median Income
  addPolygons(data = school.radius, label = school.labels, stroke = TRUE, fillColor = ~income.color(school.data$medincomeE.y), color = "darkblue", fillOpacity = 0.75,
              highlight = highlightOptions(weight = 5, color = "white",bringToFront = TRUE), group = "Median Income") %>%
  addLegend(pal = income.color, values = school.data$medincomeE.y, title = "Median Income", position = "bottomright", group = "Median Income") %>%
  #Traffic Volume
  addPolygons(data = school.radius, label = school.labels, stroke = TRUE, fillColor = ~traffic.color(school.data$total_vol), color = "darkred", fillOpacity = 0.75,
              highlight = highlightOptions(weight = 5, color = "white",bringToFront = TRUE), group = "Traffic Volume") %>%
  addLegend(pal = traffic.color, values = school.data$total_vol, title = "Average Traffic Volume by Day", position = "bottomright", group = "Traffic Volume") %>%
  #Proportion Arterial
  addPolygons(data = school.radius, label = school.labels, stroke = TRUE, fillColor = ~art.color(school.data$prop_arterial), color = "darkorange", fillOpacity = 0.75,
              highlight = highlightOptions(weight = 5, color = "white",bringToFront = TRUE), group = "Proportion Arterial") %>%
  addLegend(pal = art.color, values = school.data$prop_arterial, title = "Proportion of Arterial Roads", position = "bottomright", group = "Proportion Arterial") %>%
#Crash locations
  addCircleMarkers(data = crash_filter, radius = 1, color = "black", group = "Crash Locations", options = pathOptions(pane = "points")) %>%
#Overlay groups
  addLayersControl(overlayGroups = c("Crashes", "Crash Locations", "Median Income", "Tree Coverage", "Traffic Volume", "Proportion Arterial")) %>%
  hideGroup(c("Crashes", "Tree Coverage", "Median Income", "Traffic Volume", "Proportion Arterial"))

school.map